import os
import mysql.connector
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, date
from os import getcwd, path
import plotly.express as px
import plotly.offline as pyo
pyo.init_notebook_mode()
import seaborn as sns
# The "seaborn" style alias was deprecated in matplotlib 3.6 and removed in
# 3.8 (renamed "seaborn-v0_8").  Try the original name first so old installs
# behave identically, and fall back to the new name on newer matplotlib.
try:
    plt.style.use("seaborn")
except OSError:
    plt.style.use("seaborn-v0_8")
sns.set_style("whitegrid", {'axes.grid' : False})
#Working directory
# NOTE(review): this rebinding shadows `path` imported from os on line 7.
# Later code always calls os.path.join explicitly, so behavior is unaffected,
# but consider renaming (e.g. DATA_DIR) to avoid confusion.
path = r"C:\Users\Usuario\Documents\2. Programas\GitHub\Data and programming II\final-project-final-project-adriana-juan"
#Opening MySQL access and find the dataset:
# NOTE(security): database credentials were hard-coded in plaintext.  They can
# now be overridden via environment variables; the original values remain as
# fallbacks for backward compatibility, but should be removed from source.
mydb = mysql.connector.connect(
    host = os.environ.get("SHECODES_DB_HOST", "35.214.185.93"),
    user = os.environ.get("SHECODES_DB_USER", "uqjzxtryqrr5g"),
    password = os.environ.get("SHECODES_DB_PASSWORD", "qz0xs0lucobl")
)
# List every database visible to this user (one tuple per row).
mycursor = mydb.cursor()
mycursor.execute("SHOW DATABASES")
for x in mycursor:
    print(x)
('information_schema',)
('dbhfgcileg5zjx',)
#Show tables:
# Reconnect, this time selecting the project database.  Credentials fall back
# to the original hard-coded values but can be overridden via the environment
# (NOTE(security): avoid committing credentials to source control).
mydb = mysql.connector.connect(
    host = os.environ.get("SHECODES_DB_HOST", "35.214.185.93"),
    user = os.environ.get("SHECODES_DB_USER", "uqjzxtryqrr5g"),
    password = os.environ.get("SHECODES_DB_PASSWORD", "qz0xs0lucobl"),
    database = os.environ.get("SHECODES_DB_NAME", "dbhfgcileg5zjx")
)
dbhfgcileg5zjx_tables = pd.read_sql_query("SHOW TABLES FROM dbhfgcileg5zjx", mydb)
dbhfgcileg5zjx_tables
| Tables_in_dbhfgcileg5zjx | |
|---|---|
| 0 | corona_details |
| 1 | demographic_info |
| 2 | education |
| 3 | education_type |
| 4 | employment |
| 5 | graduates_followup |
| 6 | lessons_followup |
| 7 | lessons_mapping |
| 8 | users |
#Opening each table. Each table was one part of the survey the organization did. Each part was not applied at the same time:
# Reconnect (credentials overridable via environment; the hard-coded values
# are kept only as fallbacks -- NOTE(security): move them out of source).
mydb = mysql.connector.connect(
    host = os.environ.get("SHECODES_DB_HOST", "35.214.185.93"),
    user = os.environ.get("SHECODES_DB_USER", "uqjzxtryqrr5g"),
    password = os.environ.get("SHECODES_DB_PASSWORD", "qz0xs0lucobl"),
    database = os.environ.get("SHECODES_DB_NAME", "dbhfgcileg5zjx")
)
# One SELECT statement per survey table, executed lazily further below.
corona_table = "SELECT * FROM corona_details"
demographic_table = "SELECT * FROM demographic_info"
education_table = "SELECT * FROM education"
education_type_table = "SELECT * FROM education_type"
employment_table = "SELECT * FROM employment"
graduates_table = "SELECT * FROM graduates_followup"
lessons_followup_table = "SELECT * FROM lessons_followup"
lessons_mapping_table = "SELECT * FROM lessons_mapping"
users_table = "SELECT * FROM users"
#Let's start with the Users table to understand how each user is presented:
df_users = pd.read_sql_query(users_table, mydb)
# submit_date is stored as a Unix epoch (seconds).  errors="coerce" turns any
# unparseable/NULL value into NaT instead of raising (the data does contain
# NULLs, shown as NaT in the output below).
df_users["submit_date"] = pd.to_datetime(df_users["submit_date"], unit="s", errors="coerce")
df_users
#One user is identified both by shecodes_id and by userid_connect (mentioned
#during a meeting): some tables use shecodes_id and others userid_connect.
#For example, "Maria" can be shecodes_id = 5 and userid_connect = 7.
| shecodes_id | id | userid_connect | branch_id | how_did_you_hear | where_I_heard_about_shecodes | why_shecodes | entry_date | submit_date | if_in_team | branch_name | short_name | city | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 5 | 7 | 12.0 | 3 | None | 19 | 11/11/2021 | 2021-11-11 08:08:36 | team | Applied Materials | applied | 2.0 |
| 1 | 6 | 6 | 8 | 13.0 | None | None | None | 31/12/2017 | NaT | team | IBM | IBM | 16.0 |
| 2 | 9 | 9 | 11 | 13.0 | 1, 2, 3, 4 | None | 5, 6, 7, 8, 9, 10, 13 | 31/12/2017 | NaT | not_team | IBM | IBM | 16.0 |
| 3 | 12 | 12 | 14 | 30.0 | None | None | None | 08/07/2018 | NaT | team | Intel Jerusalem | Intel jlm | 8.0 |
| 4 | 15 | 15 | 17 | 34.0 | None | None | None | None | NaT | team | Ben Gurion University - Mon | BGU Mon | 13.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 11605 | 32558 | 32558 | 28024 | 7.0 | 9 | None | 4, 19 | 29/10/2021 | 2021-10-29 18:46:40 | not_team | Check Point | checkpoint | 1.0 |
| 11606 | 32573 | 32573 | 28040 | 73.0 | 7 | Joytunes | 20 | 30/10/2021 | 2021-10-30 15:02:58 | not_team | Monday Eve A Virtual | online branch | NaN |
| 11607 | 32586 | 32586 | 28048 | 9.0 | 2 | None | 4, 19, 999 | 30/10/2021 | 2021-10-30 18:43:34 | not_team | Google Campus Hacknight | google hacknight | 1.0 |
| 11608 | 32624 | 32624 | 28076 | 9.0 | 1 | None | 4, 19 | 31/10/2021 | 2021-10-31 13:55:54 | not_team | Google Campus Hacknight | google hacknight | 1.0 |
| 11609 | 32644 | 32644 | 28085 | 52.0 | 5 | None | 19 | 31/10/2021 | 2021-10-31 18:36:45 | not_team | Ebay | ebay | 7.0 |
11610 rows × 13 columns
#Now the Corona table:
df_corona = pd.read_sql_query(corona_table, mydb)
df_corona
#Users here are identified by shecodes_id, but the ids repeat because of how
#the survey was administered.  Mapping users consistently was a big problem
#throughout the project.
#For the Corona survey we define a COVID-19 "victim" as a user who reported
#being affected by COVID-19 at least once: e.g. if a user struggled to find a
#job after her BA degree because of COVID, but later reported finding one,
#she still counts as a COVID-19 victim.
#The corona_victim column below is 1 if she reported being affected at least
#once; duplicates are then collapsed to one row per user, keeping the victim
#row whenever one exists.
| id | shecodes_id | after_BA_work_difficulties | not_working_because_corona | working_terms_got_worse | hard_to_find_work | submit_date | |
|---|---|---|---|---|---|---|---|
| 0 | 14 | 17190 | 1.0 | 1 | None | None | 2021-02-26 05:26:21 |
| 1 | 15 | 17190 | 0.0 | 1 | None | None | 2021-02-26 06:07:24 |
| 2 | 16 | 17190 | 0.0 | 1 | None | None | 2021-02-26 09:49:09 |
| 3 | 22 | 28481 | 0.0 | None | None | None | 2021-03-03 10:12:56 |
| 4 | 24 | 11625 | 0.0 | None | None | None | 2021-03-03 10:30:52 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 3576 | 9093 | 32349 | NaN | None | None | None | 2021-11-22 12:30:20 |
| 3577 | 9094 | 32480 | NaN | None | None | None | 2021-11-22 12:31:24 |
| 3578 | 9095 | 32482 | NaN | None | None | None | 2021-11-22 12:35:58 |
| 3579 | 9096 | 32361 | NaN | None | None | None | 2021-11-22 12:38:03 |
| 3580 | 9098 | 32361 | NaN | None | None | None | 2021-11-22 12:42:11 |
3581 rows × 7 columns
#Changing format to numeric: the four COVID-impact columns arrive as strings
#with SQL NULLs; pd.to_numeric maps them to floats with NaN for the NULLs.
#A single loop replaces the four copy-pasted conversion lines.
impact_columns = ["after_BA_work_difficulties", "not_working_because_corona",
                  "working_terms_got_worse", "hard_to_find_work"]
for impact_col in impact_columns:
    df_corona[impact_col] = pd.to_numeric(df_corona[impact_col])
print (df_corona.dtypes)
id int64 shecodes_id int64 after_BA_work_difficulties float64 not_working_because_corona float64 working_terms_got_worse float64 hard_to_find_work float64 submit_date object dtype: object
#Print every column name on its own line:
print(*df_corona.columns, sep="\n")
id shecodes_id after_BA_work_difficulties not_working_because_corona working_terms_got_worse hard_to_find_work submit_date
#Defining corona victim and cleaning the data to work with 1 user:
def conditions(s):
    """Return 1 if survey row *s* reports any COVID-19 impact, else 0.

    A respondent counts as a "corona victim" when at least one of the four
    impact columns holds a value greater than zero.  NaN answers compare as
    not-greater-than-zero, so missing responses never flag a victim
    (identical to the original chained-or behavior).
    """
    impact_columns = (
        'after_BA_work_difficulties',
        'not_working_because_corona',
        'working_terms_got_worse',
        'hard_to_find_work',
    )
    return int(any(s[col] > 0 for col in impact_columns))
df_corona['corona_victim'] = df_corona.apply(conditions, axis=1)
# Deduplicate to one row per user: sorting by corona_victim ascending and
# keeping the LAST duplicate per shecodes_id retains the victim (1) row
# whenever a user has any, otherwise her non-victim (0) row -- i.e. victim
# status is "sticky" across a user's repeated survey submissions.
df_corona_victim = df_corona.sort_values('corona_victim',ascending=True).drop_duplicates('shecodes_id',keep='last')
df_corona_victim
| id | shecodes_id | after_BA_work_difficulties | not_working_because_corona | working_terms_got_worse | hard_to_find_work | submit_date | corona_victim | |
|---|---|---|---|---|---|---|---|---|
| 1897 | 4809 | 22119 | NaN | NaN | 0.0 | 0.0 | 2021-08-23 15:52:42 | 0 |
| 1899 | 4812 | 27907 | NaN | NaN | 0.0 | 0.0 | 2021-08-23 15:54:44 | 0 |
| 1901 | 4817 | 18568 | NaN | NaN | 0.0 | 0.0 | 2021-08-23 15:59:19 | 0 |
| 1896 | 4808 | 18196 | NaN | NaN | 0.0 | 0.0 | 2021-08-23 15:52:35 | 0 |
| 1912 | 4838 | 19165 | NaN | NaN | 0.0 | 0.0 | 2021-08-23 16:13:25 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2633 | 6104 | 28428 | NaN | NaN | 2.0 | 0.0 | 2021-09-17 17:06:14 | 1 |
| 2647 | 6124 | 24218 | NaN | NaN | 0.0 | 1.0 | 2021-09-19 15:27:50 | 1 |
| 2648 | 6125 | 28984 | NaN | NaN | 2.0 | 0.0 | 2021-09-19 15:35:14 | 1 |
| 2592 | 6039 | 31191 | NaN | NaN | 0.0 | 1.0 | 2021-09-12 16:31:00 | 1 |
| 1790 | 4614 | 30050 | NaN | NaN | 0.0 | 1.0 | 2021-08-23 14:40:40 | 1 |
2239 rows × 8 columns
#Share of corona victims among the de-duplicated users:
corona_counts = (
    df_corona_victim["corona_victim"]
    .value_counts()
    .rename_axis("corona_victim")
    .reset_index(name="counts")
)
corona_counts["percent"] = 100 * corona_counts["counts"] / corona_counts["counts"].sum()
corona_counts
| corona_victim | counts | percent | |
|---|---|---|---|
| 0 | 0 | 1667 | 74.452881 |
| 1 | 1 | 572 | 25.547119 |
Calculating "COVID victims"
#Plot percentage of COVID victims:
my_colors=["lightblue","lightgreen",
           "silver","green"]
# BUG FIX: groupby sorts corona_victim ascending (0 first, then 1), so the
# first wedge is the NON-victims (~74.5% in the table above).  The labels
# list was the other way around, mislabelling both wedges.
fig_1 = corona_counts.groupby(["corona_victim"]).sum().plot(kind="pie",
                             y="percent",
                             figsize=(6,6),colors=my_colors,
                             autopct="%1.1f%%",
                             explode=(0,0.15),
                             labels= ["No COVID Victims", "COVID Victims"],
                             ylabel="",
                             title = "SheCodes Users: percentage impacted by COVID-19")
fig_1.figure.savefig("fig_1.pdf")
fig_1
<AxesSubplot:title={'center':'SheCodes Users: percentage impacted by COVID-19'}>
Demographics
#Let's work now with Demographics:
# Reconnect (credentials overridable via environment; hard-coded fallbacks
# kept for backward compatibility -- NOTE(security): move them out of source).
mydb = mysql.connector.connect(
    host = os.environ.get("SHECODES_DB_HOST", "35.214.185.93"),
    user = os.environ.get("SHECODES_DB_USER", "uqjzxtryqrr5g"),
    password = os.environ.get("SHECODES_DB_PASSWORD", "qz0xs0lucobl"),
    database = os.environ.get("SHECODES_DB_NAME", "dbhfgcileg5zjx")
)
demographic_table = "SELECT * FROM demographic_info"
df_demographic = pd.read_sql_query(demographic_table, mydb)
# submit_date is a Unix epoch in seconds; NULL/invalid values become NaT.
df_demographic["submit_date"] = pd.to_datetime(df_demographic["submit_date"], unit="s", errors="coerce")
df_demographic
| id | shecodes_id | birthdate | family_status | kids_under_18 | kids_6-9 | kids_under_5 | birth_country | aliya_year | city_grow | city | sector | religios_level | is_army_or_ns | is_army_or_ns_tech_unit | is_army_or_ns_tech_position | entry_date | submit_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12502 | 6 | 7/8/1989 | 3.0 | None | None | None | None | None | None | None | 1 | 1.0 | None | None | None | 12/31/2017 | NaT |
| 1 | 11 | 74 | 19/12/1980 | 3.0 | 1 | None | None | None | None | None | 1127 | 1 | 1.0 | None | None | None | 06/03/2018 | NaT |
| 2 | 15 | 112 | 14/4/1989 | 1.0 | 0 | None | None | 1 | None | 1142 | 1248 | 1 | 1.0 | 2 | 0 | 0 | 11/01/2021 | 2021-02-19 22:26:39 |
| 3 | 11539 | 105 | 1/14/1988 | 3.0 | None | None | None | None | None | None | None | 1 | 1.0 | None | None | None | 7/24/2018 | NaT |
| 4 | 22578 | 110 | 25/09/1991 | 1.0 | 0 | None | None | 1 | None | 765 | 897 | 1 | 1.0 | 2 | 0 | 0 | 20/07/2021 | 2021-07-20 18:22:57 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9089 | 23615 | 31979 | 06/06/1993 | 1.0 | 0 | None | None | 1 | None | 1000 | 141 | 1 | 2.0 | 2 | 0 | 0 | 19/10/2021 | 2021-10-19 16:56:47 |
| 9090 | 23566 | 31930 | 10/03/1990 | 1.0 | 0 | None | None | 1 | None | 1248 | 1248 | 999 | NaN | 2 | 0 | 1 | 18/10/2021 | 2021-10-18 13:05:02 |
| 9091 | 22933 | 31313 | 30/09/1987 | 2.0 | 0 | None | None | 1 | None | 244 | 891 | 1 | 1.0 | 2 | 0 | 0 | 12/10/2021 | 2021-10-12 10:12:24 |
| 9092 | 23942 | 32295 | 03/09/1988 | 1.0 | 0 | None | None | 1 | None | 610 | 1248 | 1 | 1.0 | 2 | 1 | 1 | 26/10/2021 | 2021-10-26 18:01:50 |
| 9093 | 23539 | 31903 | 26/11/1998 | 1.0 | 0 | None | None | 1 | None | 817 | 817 | 1 | 2.0 | 1 | None | 0 | 17/10/2021 | 2021-10-17 21:41:46 |
9094 rows × 18 columns
#We had access to the metadata (but it was in Hebrew).  We had to open each
#.csv document to see what each column meant and to understand the codes.
#This table uses shecodes_id and has no duplicated users.
#Let's start with family_status:
# FIX: the DataFrame name was first used to hold the file name and then
# overwritten with the DataFrame -- use a separate variable for the file name.
family_status_file = "sc_family_status.csv"
df_family_status = pd.read_csv(os.path.join(path, family_status_file))
# English translations, in the same row order as the CSV (codes 1..7, 999).
label_english = ["Single", "In a relationship", "Married", "Separated", "Divorcee", "Live separately", "Widow", "Not interested in answering"]
df_family_status["label_english"] = label_english
df_family_status
| label | value | salesforce_id | label_english | |
|---|---|---|---|---|
| 0 | רווקה | 1 | a0u4J000000HSKB | Single |
| 1 | במערכת יחסים | 2 | a0u4J000000HSKC | In a relationship |
| 2 | נשואה | 3 | a0u4J000000HSKD | Married |
| 3 | פרודה | 4 | a0u4J000000HSKE | Separated |
| 4 | גרושה | 5 | a0u4J000000HSKF | Divorcee |
| 5 | חיה בנפרד | 6 | a0u4J000000HSKG | Live separately |
| 6 | אלמנה | 7 | a0u4J000000HSKH | Widow |
| 7 | לא מעוניינת לענות | 999 | a0u4J000000HSKI | Not interested in answering |
# Frequency and percentage of each family_status code:
family_counts = (
    df_demographic["family_status"]
    .value_counts()
    .rename_axis("family_status")
    .reset_index(name="counts")
)
family_counts["percent"] = 100 * family_counts["counts"] / family_counts["counts"].sum()
family_counts
| family_status | counts | percent | |
|---|---|---|---|
| 0 | 1.0 | 4484 | 49.448610 |
| 1 | 3.0 | 3011 | 33.204676 |
| 2 | 2.0 | 1055 | 11.634318 |
| 3 | 999.0 | 286 | 3.153948 |
| 4 | 5.0 | 200 | 2.205558 |
| 5 | 4.0 | 19 | 0.209528 |
| 6 | 7.0 | 8 | 0.088222 |
| 7 | 6.0 | 5 | 0.055139 |
# BUG FIX: family_counts is ordered by frequency (status codes 1, 3, 2, 999,
# 5, 4, 7, 6 -- see the table above), not by status code, so the previously
# hard-coded tick labels mislabelled most bars (e.g. the second bar, code 3 =
# Married, was labelled "In a relationship").  Build the labels from the
# codes actually plotted instead.
family_status_labels = {1: "Single", 2: "In a relationship", 3: "Married",
                        4: "Separated", 5: "Divorcee", 6: "Live separately",
                        7: "Widow", 999: "Not interested in answering"}
fig_2 = family_counts[["percent"]].plot(kind='barh', title ="SheCodes Users: Family Status", figsize=(8, 10),
                                        legend=True, fontsize=12, align='center', color='#86bf91', zorder=2, width=0.85)
sns.set_style("whitegrid", {'axes.grid' : False})
fig_2.set_xlabel("Percent (%)", fontsize=12, weight="bold", color="black")
fig_2.set_ylabel("Family Status", fontsize=12, weight="bold", color="black")
fig_2.set_title("SheCodes Users: Family Status", fontsize=14, color="black")
fig_2.set_yticklabels([family_status_labels[int(code)] for code in family_counts["family_status"]])
fig_2.get_legend().remove()
plt.savefig("fig_2.pdf")
plt.show()
#Now, let's see birth_country:
# FIX: use a separate variable for the file name instead of reusing the
# DataFrame name for it.
birth_country_file = "sc_birth_country.csv"
df_birth_country = pd.read_csv(os.path.join(path, birth_country_file))
# English translations, in CSV row order (codes 0..9).  NOTE: the earlier
# translation rendered Hebrew בריהמ (the Soviet Union) as "Brahms"; corrected
# to "USSR".
label_english_2 = ["Unknown", "Israel", "Eastern Europe and the former USSR", "Western Europe", "Asia and the East", "Africa", "North America", "Central and South America", "Australia", "Middle East"]
df_birth_country["label_english_2"] = label_english_2
df_birth_country
| label | value | label_english_2 | |
|---|---|---|---|
| 0 | לא ידוע | 0 | Unknown |
| 1 | ישראל | 1 | Israel |
| 2 | מזרח אירופה ובריהמ לשעבר" | 2 | Eastern Europe and the former Brahms |
| 3 | מערב אירופה | 3 | Western Europe |
| 4 | אסיה והמזרח | 4 | Asia and the East |
| 5 | אפריקה | 5 | Africa |
| 6 | צפון אמריקה | 6 | North America |
| 7 | מרכז ודרום אמריקה | 7 | Central and South America |
| 8 | אוסטרליה | 8 | Australia |
| 9 | מזרח התיכון | 9 | Middle East |
# Frequency and percentage of each birth_country code:
country_counts = (
    df_demographic["birth_country"]
    .value_counts()
    .rename_axis("birth_country")
    .reset_index(name="counts")
)
country_counts["percent"] = 100 * country_counts["counts"] / country_counts["counts"].sum()
country_counts
| birth_country | counts | percent | |
|---|---|---|---|
| 0 | 1 | 5369 | 80.134328 |
| 1 | 2 | 836 | 12.477612 |
| 2 | 6 | 161 | 2.402985 |
| 3 | 3 | 144 | 2.149254 |
| 4 | 7 | 87 | 1.298507 |
| 5 | 5 | 37 | 0.552239 |
| 6 | 4 | 36 | 0.537313 |
| 7 | 8 | 15 | 0.223881 |
| 8 | 9 | 8 | 0.119403 |
| 9 | 0 | 7 | 0.104478 |
# FIX: derive the tick labels from the country codes actually plotted, rather
# than a manually ordered list that silently breaks if the frequency ranking
# changes.  Also corrects the mistranslation "former Brahms" (Hebrew בריהמ =
# the Soviet Union) to "former USSR".
birth_country_labels = {0: "Unknown", 1: "Israel",
                        2: "Eastern Europe and the former USSR",
                        3: "Western Europe", 4: "Asia and the East",
                        5: "Africa", 6: "North America",
                        7: "Central and South America", 8: "Australia",
                        9: "Middle East"}
fig_3 = country_counts[["percent"]].plot(kind='barh', title ="SheCodes Users: Birth Country",
                                         figsize=(8, 10), legend=True, fontsize=12, align='center', color='#86bf91', zorder=2, width=0.85)
sns.set_style("whitegrid", {'axes.grid' : False})
fig_3.set_xlabel("Percent (%)", fontsize=12, weight="bold", color="black")
fig_3.set_ylabel("Birth Country", fontsize=12, weight="bold", color="black")
fig_3.set_yticklabels([birth_country_labels[int(code)] for code in country_counts["birth_country"]])
fig_3.set_title("SheCodes Users: Birth Country", fontsize=14, color="black")
fig_3.get_legend().remove()
plt.savefig("fig_3.pdf")
plt.show()
#Let's see religious_level:
# FIX: use a separate variable for the file name instead of reusing the
# DataFrame name for it.  ("religios" is the spelling used by the source CSV.)
religious_level_file = "sc_religios_level_fields.csv"
df_religious_level = pd.read_csv(os.path.join(path, religious_level_file))
# English translations, in CSV row order (codes 1..7, 999, 9999).
religious_level = ["Secular, not religious", "Traditional", "Religious", "Traditional - not so religious", "Traditional - religious", "Orthodox", "Haredi", "Not interested in answering", "Other"]
df_religious_level["religious_level"] = religious_level
df_religious_level
| label | value | religious_level | |
|---|---|---|---|
| 0 | חילונית, לא דתיה | 1 | Secular, not religious |
| 1 | מסורתית | 2 | Traditional |
| 2 | דתיה | 3 | Religious |
| 3 | מסורתית- לא כל כך דתיה | 4 | Traditional - not so religious |
| 4 | מסורתית- דתיה | 5 | Traditional - religious |
| 5 | אורתודוכסית | 6 | Orthodox |
| 6 | חרדית | 7 | Haredi |
| 7 | לא מעוניינת לענות | 999 | Not interested in answering |
| 8 | אחר | 9999 | Other |
# Frequency and percentage of each religios_level code (source column keeps
# the CSV's "religios" spelling):
religious_counts = (
    df_demographic["religios_level"]
    .value_counts()
    .rename_axis("religious_level2")
    .reset_index(name="counts")
)
religious_counts["percent"] = 100 * religious_counts["counts"] / religious_counts["counts"].sum()
religious_counts
| religious_level2 | counts | percent | |
|---|---|---|---|
| 0 | 1.0 | 5269 | 65.355991 |
| 1 | 2.0 | 773 | 9.588192 |
| 2 | 999.0 | 612 | 7.591168 |
| 3 | 3.0 | 466 | 5.780203 |
| 4 | 7.0 | 284 | 3.522699 |
| 5 | 4.0 | 250 | 3.100968 |
| 6 | 6.0 | 230 | 2.852890 |
| 7 | 5.0 | 96 | 1.190772 |
| 8 | 9999.0 | 53 | 0.657405 |
| 9 | 0.0 | 29 | 0.359712 |
# Attach the Hebrew label and English translation for each status code.
# NOTE(review): the inner merge keeps only codes present in the mapping CSV --
# code 0.0 (29 rows, ~0.36%) has no mapping row and is silently dropped, so
# the plotted percentages no longer sum to exactly 100.  Confirm intended.
religious_counts = religious_counts.merge(df_religious_level, how="inner", left_on="religious_level2", right_on="value")
religious_counts
| religious_level2 | counts | percent | label | value | religious_level | |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 5269 | 65.355991 | חילונית, לא דתיה | 1 | Secular, not religious |
| 1 | 2.0 | 773 | 9.588192 | מסורתית | 2 | Traditional |
| 2 | 999.0 | 612 | 7.591168 | לא מעוניינת לענות | 999 | Not interested in answering |
| 3 | 3.0 | 466 | 5.780203 | דתיה | 3 | Religious |
| 4 | 7.0 | 284 | 3.522699 | חרדית | 7 | Haredi |
| 5 | 4.0 | 250 | 3.100968 | מסורתית- לא כל כך דתיה | 4 | Traditional - not so religious |
| 6 | 6.0 | 230 | 2.852890 | אורתודוכסית | 6 | Orthodox |
| 7 | 5.0 | 96 | 1.190772 | מסורתית- דתיה | 5 | Traditional - religious |
| 8 | 9999.0 | 53 | 0.657405 | אחר | 9999 | Other |
#Bar chart of the religious-level percentages, using the English labels
#attached by the merge above.
fig_4 = sns.barplot(data=religious_counts,
                    x="religious_level",
                    y="percent",
                    palette="Greens")
sns.set_style("whitegrid", {'axes.grid' : False})
# Slant the long category names so they stay readable.
fig_4.set_xticklabels(fig_4.get_xticklabels(), rotation=40, ha="right")
fig_4.set_title("SheCodes Users: Religious Level", fontsize=14, color="black")
fig_4.set_xlabel("Religious Level", fontsize=12, color="black")
fig_4.set_ylabel("Percent", fontsize=12, color="black")
plt.tight_layout()
plt.savefig("fig_4.pdf")
plt.show()
#Calculating Age:
df_demographic.dtypes
id int64 shecodes_id int64 birthdate object family_status float64 kids_under_18 object kids_6-9 object kids_under_5 object birth_country object aliya_year object city_grow object city object sector int64 religios_level float64 is_army_or_ns object is_army_or_ns_tech_unit object is_army_or_ns_tech_position object entry_date object submit_date datetime64[ns] dtype: object
#NOTE(review): birthdate mixes day-first strings (19/12/1980) and month-first
#strings (1/14/1988); pandas infers the format per element, so ambiguous
#values like 7/8/1989 may be parsed either way -- verify against the source.
#errors="coerce" maps unparseable values to NaT instead of raising.
df_demographic["birthdate"] = pd.to_datetime(df_demographic["birthdate"], errors="coerce")
df_demographic.dtypes
id int64 shecodes_id int64 birthdate datetime64[ns] family_status float64 kids_under_18 object kids_6-9 object kids_under_5 object birth_country object aliya_year object city_grow object city object sector int64 religios_level float64 is_army_or_ns object is_army_or_ns_tech_unit object is_army_or_ns_tech_position object entry_date object submit_date datetime64[ns] dtype: object
today = pd.to_datetime("today")
# BUG FIX: subtracting only the year components over-counts everyone whose
# birthday has not yet occurred this year.  Compute age from elapsed days
# instead (365.2425 accounts for leap years); NaT birthdates still yield NaN.
df_demographic["Age"] = np.floor((today - df_demographic["birthdate"]).dt.days / 365.2425)
df_demographic
| id | shecodes_id | birthdate | family_status | kids_under_18 | kids_6-9 | kids_under_5 | birth_country | aliya_year | city_grow | city | sector | religios_level | is_army_or_ns | is_army_or_ns_tech_unit | is_army_or_ns_tech_position | entry_date | submit_date | Age | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12502 | 6 | 1989-07-08 | 3.0 | None | None | None | None | None | None | None | 1 | 1.0 | None | None | None | 12/31/2017 | NaT | 33.0 |
| 1 | 11 | 74 | 1980-12-19 | 3.0 | 1 | None | None | None | None | None | 1127 | 1 | 1.0 | None | None | None | 06/03/2018 | NaT | 42.0 |
| 2 | 15 | 112 | 1989-04-14 | 1.0 | 0 | None | None | 1 | None | 1142 | 1248 | 1 | 1.0 | 2 | 0 | 0 | 11/01/2021 | 2021-02-19 22:26:39 | 33.0 |
| 3 | 11539 | 105 | 1988-01-14 | 3.0 | None | None | None | None | None | None | None | 1 | 1.0 | None | None | None | 7/24/2018 | NaT | 34.0 |
| 4 | 22578 | 110 | 1991-09-25 | 1.0 | 0 | None | None | 1 | None | 765 | 897 | 1 | 1.0 | 2 | 0 | 0 | 20/07/2021 | 2021-07-20 18:22:57 | 31.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9089 | 23615 | 31979 | 1993-06-06 | 1.0 | 0 | None | None | 1 | None | 1000 | 141 | 1 | 2.0 | 2 | 0 | 0 | 19/10/2021 | 2021-10-19 16:56:47 | 29.0 |
| 9090 | 23566 | 31930 | 1990-10-03 | 1.0 | 0 | None | None | 1 | None | 1248 | 1248 | 999 | NaN | 2 | 0 | 1 | 18/10/2021 | 2021-10-18 13:05:02 | 32.0 |
| 9091 | 22933 | 31313 | 1987-09-30 | 2.0 | 0 | None | None | 1 | None | 244 | 891 | 1 | 1.0 | 2 | 0 | 0 | 12/10/2021 | 2021-10-12 10:12:24 | 35.0 |
| 9092 | 23942 | 32295 | 1988-03-09 | 1.0 | 0 | None | None | 1 | None | 610 | 1248 | 1 | 1.0 | 2 | 1 | 1 | 26/10/2021 | 2021-10-26 18:01:50 | 34.0 |
| 9093 | 23539 | 31903 | 1998-11-26 | 1.0 | 0 | None | None | 1 | None | 817 | 817 | 1 | 2.0 | 1 | None | 0 | 17/10/2021 | 2021-10-17 21:41:46 | 24.0 |
9094 rows × 19 columns
#Understanding Age of Users: let's start with some exploratory data analysis.
#Source: https://medium.com/nerd-for-tech/analyse-the-distribution-of-ages-python-data-analysis-series-part-1-cc0fb2ca7f36
fig_5 = px.scatter(df_demographic, y="Age", color_discrete_sequence =['green']*len(df_demographic))
# FIX: the plot background was "rgba(03, 0, 0, 0)" -- the stray "3" was a
# typo for the fully transparent black used by every other figure.
fig_5.update_layout({"plot_bgcolor": "rgba(0, 0, 0, 0)",
                     "paper_bgcolor": "rgba(0, 0, 0, 0)",})
fig_5.show()
#Ages outside 10-80 look like data-entry errors in the survey, so keep only
#the plausible range (both bounds inclusive, matching the original query).
df_demographic = df_demographic[df_demographic["Age"].between(10, 80)]
fig_6 = px.scatter(df_demographic, y="Age",
                   color_discrete_sequence=['green'] * len(df_demographic))
fig_6.update_layout({"plot_bgcolor": "rgba(0, 0, 0, 0)",
                     "paper_bgcolor": "rgba(0, 0, 0, 0)",})
fig_6.show()
#Now that the distribution looks right, show the users' ages as a bar chart
#(the x axis is numeric, so plotly orders the bars by age automatically).
age_counts = df_demographic['Age'].value_counts()
fig_7 = px.bar(age_counts, title="SheCodes users: Age",
               color_discrete_sequence=['green'] * len(age_counts))
fig_7.update_layout(xaxis_title="Age",
                    yaxis_title="Frequency",
                    title_x=0.5,
                    showlegend=False,
                    plot_bgcolor="rgba(0, 0, 0, 0)",
                    paper_bgcolor="rgba(0, 0, 0, 0)")
#NOTE(review): exported as fig_5.pdf -- presumably the fifth *saved* figure;
#confirm the file name is intentional.
fig_7.write_image("fig_5.pdf")
fig_7.show()
#We can see the distribution of user's age in the following graph (with a concentration
#around 36 years old)
#Let's start working with Education, Employment, and Graduates
# Open a fresh MySQL connection to the survey database.
# NOTE(review): credentials are hard-coded in the notebook — they should live in
# environment variables or a config file outside version control.
connection = mysql.connector.connect(host = '35.214.185.93',
user = 'uqjzxtryqrr5g',
password = 'qz0xs0lucobl',
db = 'dbhfgcileg5zjx')
tables = pd.read_sql_query('SHOW TABLES FROM dbhfgcileg5zjx', connection)
# One SELECT per survey table; each becomes its own DataFrame below.
employment_table = 'SELECT * FROM employment'
education_table = 'SELECT * FROM education'
education_type_table = 'SELECT * FROM education_type'
users_table = 'SELECT * FROM users'
graduates_table = 'SELECT * FROM graduates_followup'
df_employment = pd.read_sql_query(employment_table, connection)
df_education = pd.read_sql_query(education_table, connection)
df_education_type = pd.read_sql_query(education_type_table, connection)
df_users = pd.read_sql_query(users_table, connection)
df_graduates = pd.read_sql_query(graduates_table, connection)
# Inspect the raw education records.
df_education
| id | shecodes_id | type | diploma_status | institute | start_year | end_year | first_discipline | first_faculty | second_discipline | second_faculty | course_name | entry_date | start_date | end_date | submit_date | studies_after_shecodes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7261 | 6 | 3.0 | 13.0 | 3 | 0.0 | 2015.0 | 0.0 | 2000 | 0 | 0.0 | None | 31/12/2017 | None | None | None | None |
| 1 | 1562 | 112 | 3.0 | 13.0 | 3 | 2012.0 | 2015.0 | 614.0 | 280 | 0 | 810.0 | None | 30/06/2019 | None | None | None | None |
| 2 | 7901 | 112 | 3.0 | 23.0 | 2 | 2015.0 | 2017.0 | 614.0 | 810 | 0 | 280.0 | None | 30/06/2019 | None | None | None | None |
| 3 | 11930 | 112 | 1.0 | 2.0 | None | 0.0 | 0.0 | 13.0 | None | 0 | 0.0 | None | 30/06/2019 | None | None | None | None |
| 4 | 48054 | 112 | 3.0 | 13.0 | 3 | 2012.0 | 2015.0 | 9000.0 | 614 | None | 203.0 | None | 14/10/2020 16:16:44 | 2012-10-01 | 2015-08-01 | 1602692201 | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 28700 | 66113 | 31313 | 3.0 | 13.0 | 2 | 2010.0 | 2015.0 | 6036.0 | 6006 | None | NaN | None | 12/10/2021 10:12:24 | 2010-10-24 | 2015-03-25 | 1634033544 | None |
| 28701 | 66114 | 31313 | 1.0 | 2.0 | None | NaN | NaN | 33.0 | None | None | NaN | None | 12/10/2021 10:12:24 | None | None | 1634033544 | None |
| 28702 | 69876 | 32295 | 2.0 | 3.0 | 999 | 2007.0 | 2009.0 | 19.0 | None | None | NaN | None | 26/10/2021 18:01:50 | 2007-03-11 | 2009-06-14 | 1635271310 | None |
| 28703 | 69877 | 32295 | 1.0 | 2.0 | None | NaN | NaN | 13.0 | None | None | NaN | None | 26/10/2021 18:01:50 | None | None | 1635271310 | None |
| 28704 | 68209 | 31903 | 1.0 | 2.0 | None | NaN | NaN | 13.0 | None | None | NaN | None | 17/10/2021 21:41:46 | None | None | 1634506906 | None |
28705 rows × 17 columns
## Solution for education date
# Enrollment/graduation timestamps are stored as Unix epoch seconds; convert both.
for _col in ("date_enrolled", "date_graduated"):
    df_graduates[_col] = pd.to_datetime(df_graduates[_col], unit='s')
## Organizing education type 3
# For each education type, keep only the most recent record per user
# (latest entry_date wins after deduplication).
df_1 = (df_education[df_education["type"] == 3]
        .sort_values("shecodes_id")
        .sort_values("entry_date")
        .drop_duplicates("shecodes_id", keep="last"))
df_2 = (df_education[df_education["type"] == 2]
        .sort_values("shecodes_id")
        .sort_values("entry_date")
        .drop_duplicates("shecodes_id", keep="last"))
df_2
| id | shecodes_id | type | diploma_status | institute | start_year | end_year | first_discipline | first_faculty | second_discipline | second_faculty | course_name | entry_date | start_date | end_date | submit_date | studies_after_shecodes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8154 | 10681 | 15799 | 2.0 | 3.0 | 71 | 2018.0 | 0.0 | 0.0 | None | 0 | 0.0 | None | 01/01/2019 | None | None | None | None |
| 1612 | 10049 | 6775 | 2.0 | 3.0 | 8 | 2015.0 | 0.0 | 0.0 | None | 0 | 0.0 | None | 01/01/2019 | None | None | None | None |
| 7658 | 10669 | 15171 | 2.0 | 3.0 | 64 | 2015.0 | 0.0 | 0.0 | None | 0 | 0.0 | None | 01/01/2019 | None | None | None | None |
| 7980 | 10053 | 15569 | 2.0 | 3.0 | 8 | 1991.0 | 0.0 | 0.0 | None | 0 | 0.0 | None | 01/01/2019 | None | None | None | None |
| 8581 | 10623 | 15507 | 2.0 | 3.0 | 36 | 2016.0 | 0.0 | 0.0 | None | 0 | 0.0 | None | 01/01/2019 | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 26464 | 51847 | 27331 | 2.0 | 3.0 | 9 | 2018.0 | 2019.0 | 3.0 | None | None | NaN | None | 31/12/2020 05:49:33 | 2018-12-16 | 2019-12-16 | 1609393771 | None |
| 22914 | 51872 | 25231 | 2.0 | 2.0 | 8 | 2020.0 | 2021.0 | 3.0 | None | None | NaN | None | 31/12/2020 08:14:30 | 2020-09-01 | 2021-04-01 | 1609402467 | None |
| 23380 | 51908 | 25574 | 2.0 | 3.0 | 999 | 2020.0 | 2020.0 | 19.0 | None | None | NaN | None | 31/12/2020 09:14:19 | 2020-02-01 | 2020-07-31 | 1609406057 | None |
| 23753 | 52014 | 27391 | 2.0 | 3.0 | 999 | 2011.0 | 2013.0 | 12.0 | None | None | NaN | None | 31/12/2020 14:26:30 | 2011-04-01 | 2013-03-31 | 1609424788 | None |
| 23455 | 52057 | 27406 | 2.0 | 3.0 | 999 | 2016.0 | 2016.0 | 8.0 | None | None | NaN | None | 31/12/2020 17:43:29 | 2016-01-01 | 2016-07-28 | 1609436608 | None |
2177 rows × 17 columns
# Education type 1 (high school, per the cat_educ categorisation below):
# keep each user's latest record.
df_3 = (df_education[df_education["type"] == 1]
        .sort_values("shecodes_id")
        .sort_values("entry_date")
        .drop_duplicates("shecodes_id", keep="last"))
df_3
| id | shecodes_id | type | diploma_status | institute | start_year | end_year | first_discipline | first_faculty | second_discipline | second_faculty | course_name | entry_date | start_date | end_date | submit_date | studies_after_shecodes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2116 | 14641 | 8113 | 1.0 | NaN | None | 0.0 | 0.0 | 42.0 | None | 0 | 0.0 | None | 01/01/2018 | None | None | None | None |
| 7536 | 13751 | 8105 | 1.0 | NaN | None | 0.0 | 0.0 | 2.0 | None | 0 | 0.0 | None | 01/01/2018 | None | None | None | None |
| 3112 | 13748 | 8092 | 1.0 | NaN | None | 0.0 | 0.0 | 2.0 | None | 0 | 0.0 | None | 01/01/2018 | None | None | None | None |
| 2113 | 13762 | 8044 | 1.0 | NaN | None | 0.0 | 0.0 | 2.0 | None | 0 | 0.0 | None | 01/01/2018 | None | None | None | None |
| 2095 | 13755 | 8037 | 1.0 | NaN | None | 0.0 | 0.0 | 2.0 | None | 0 | 0.0 | None | 01/01/2018 | None | None | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 5699 | 52056 | 13089 | 1.0 | 2.0 | None | NaN | NaN | 47.0 | None | None | NaN | None | 31/12/2020 17:42:04 | None | None | 1609436517 | None |
| 23456 | 52058 | 27406 | 1.0 | 2.0 | None | NaN | NaN | 13.0 | None | None | NaN | None | 31/12/2020 17:43:30 | None | None | 1609436608 | None |
| 23818 | 52068 | 27410 | 1.0 | 2.0 | None | NaN | NaN | 76.0 | None | None | NaN | None | 31/12/2020 18:17:55 | None | None | 1609438672 | None |
| 23569 | 52070 | 27411 | 1.0 | 2.0 | None | NaN | NaN | 33.0 | None | None | NaN | None | 31/12/2020 19:09:51 | None | None | 1609441789 | None |
| 23392 | 52079 | 27415 | 1.0 | 2.0 | None | NaN | NaN | 33.0 | None | None | NaN | None | 31/12/2020 20:12:48 | None | None | 1609445565 | None |
7160 rows × 17 columns
# Education type 4 (free-text course_name entries — presumably non-degree
# courses; verify against education_type): keep each user's latest record.
df_4 = (df_education[df_education["type"] == 4]
        .sort_values("shecodes_id")
        .sort_values("entry_date")
        .drop_duplicates("shecodes_id", keep="last"))
df_4
| id | shecodes_id | type | diploma_status | institute | start_year | end_year | first_discipline | first_faculty | second_discipline | second_faculty | course_name | entry_date | start_date | end_date | submit_date | studies_after_shecodes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23505 | 52094 | 27420 | 4.0 | 1.0 | None | 2012.0 | 2013.0 | NaN | None | None | NaN | רואת חשבון | 01/01/2021 09:15:02 | 2012-01-01 | 2013-01-01 | 1609492498 | None |
| 23462 | 52097 | 27421 | 4.0 | 1.0 | None | 2013.0 | 2015.0 | NaN | None | None | NaN | רואת חשבון | 01/01/2021 09:15:05 | 2013-09-01 | 2015-12-31 | 1609492498 | None |
| 22860 | 52100 | 27422 | 4.0 | 1.0 | None | 2019.0 | 2019.0 | NaN | None | None | NaN | קופירייטינג ושיווק דיגיטלי | 01/01/2021 09:24:35 | 2019-01-01 | 2019-05-01 | 1609493072 | None |
| 23374 | 52156 | 27442 | 4.0 | 1.0 | None | 2018.0 | 2019.0 | NaN | None | None | NaN | קאוצ'ינג באונ' חיפה | 01/01/2021 17:20:49 | 2018-11-08 | 2019-07-26 | 1609521646 | None |
| 26121 | 34162 | 23245 | 4.0 | 1.0 | None | 2017.0 | 2019.0 | NaN | None | None | NaN | מנהלת חשבונות | 01/03/2020 00:11:29 | 2017-12-22 | 2019-12-03 | None | None |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 22386 | 39008 | 23879 | 4.0 | 1.0 | None | 2012.0 | 2013.0 | NaN | None | None | NaN | ספורט-תרפיסטית | None | 2012-10-02 | 2013-07-30 | 1588668996 | None |
| 17288 | 39002 | 23905 | 4.0 | 1.0 | None | 2003.0 | 2007.0 | NaN | None | None | NaN | קולנוע מביה"ס סם שפיגל | None | 2003-10-01 | 2007-05-31 | 1588668996 | None |
| 21236 | 39000 | 23907 | 4.0 | 1.0 | None | 2015.0 | 2020.0 | NaN | None | None | NaN | משחק | None | 2015-10-13 | 2020-08-01 | 1588668996 | None |
| 16995 | 38994 | 23934 | 4.0 | 1.0 | None | 1991.0 | 1991.0 | NaN | None | None | NaN | קורס תכנות COBOL ו- BASIC, בתוספת תרשים זרימה | None | 1991-08-05 | 1991-12-20 | 1588668996 | None |
| 18811 | 38991 | 23955 | 4.0 | 1.0 | None | 2017.0 | 2018.0 | NaN | None | None | NaN | אימון אישי - קאוצרית | None | 2017-07-16 | 2018-04-22 | 1588668996 | None |
594 rows × 17 columns
# Recombine the per-type deduplicated education records and pivot the `type`
# column into one-hot indicator columns (type_1.0 ... type_4.0), one row per user.
frames = [df_1, df_2, df_3, df_4]
result = pd.concat(frames)
result_count = result["shecodes_id"].value_counts()
df_education_fin = (pd.get_dummies(result, columns=['type'])
                    .groupby(['shecodes_id'], as_index=False)
                    .sum())
df_education_fin
| shecodes_id | id | diploma_status | start_year | end_year | first_discipline | second_faculty | type_1.0 | type_2.0 | type_3.0 | type_4.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 263812 | 27.0 | 6046.0 | 6022.0 | 8014.0 | 0.0 | 1 | 1 | 1 | 1 |
| 1 | 6 | 7261 | 13.0 | 0.0 | 2015.0 | 0.0 | 0.0 | 0 | 0 | 1 | 0 |
| 2 | 9 | 15364 | 0.0 | 0.0 | 0.0 | 111.0 | 0.0 | 1 | 0 | 0 | 0 |
| 3 | 12 | 7229 | 13.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0 | 0 | 1 | 0 |
| 4 | 21 | 92063 | 14.0 | 2020.0 | 2023.0 | 9013.0 | 0.0 | 1 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8632 | 32558 | 140615 | 3.0 | 2016.0 | 2020.0 | 116.0 | 0.0 | 1 | 0 | 0 | 1 |
| 8633 | 32573 | 140761 | 25.0 | 2008.0 | 2015.0 | 9012.0 | 0.0 | 1 | 0 | 1 | 0 |
| 8634 | 32586 | 70429 | 2.0 | 0.0 | 0.0 | 27.0 | 0.0 | 1 | 0 | 0 | 0 |
| 8635 | 32624 | 141129 | 14.0 | 2019.0 | 2022.0 | 9076.0 | 624.0 | 1 | 0 | 1 | 0 |
| 8636 | 32644 | 141345 | 15.0 | 2010.0 | 2014.0 | 9013.0 | 0.0 | 1 | 0 | 1 | 0 |
8637 rows × 11 columns
# Inspect the raw employment records (before per-user deduplication).
df_employment
| id | shecodes_id | currently_employed | position_scope | is_tech_company | company_name | is_tech_position | start_job_date | end_job_date | employment_domain | employment_role | job_rank | seniority | company_type | not_working_reason | entry_date | submit_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 6 | 1.0 | 3 | 1.0 | 1.0 | None | None | 11 | None | None | 5 | 7 | None | 31/12/2017 | None | |
| 1 | 19 | 74 | 0.0 | None | 0.0 | 0.0 | None | None | None | None | None | None | None | None | None | ||
| 2 | 28 | 112 | 0.0 | None | NaN | NaN | None | None | None | None | None | None | None | None | 30/06/2019 | None | |
| 3 | 24321 | 112 | 0.0 | 3 | 0.0 | משרד רה״מ | 0.0 | 13/02/2020 | None | 19 | None | None | 3 | 4 | None | 14/10/2020 | 1602692201 |
| 4 | 24322 | 112 | 0.0 | 3 | 1.0 | TASC | 0.0 | 21/07/2019 | 09/02/2020 | 24 | None | None | 4 | 1 | None | 14/10/2020 | 1602692201 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 18163 | 40717 | 31313 | 0.0 | 3 | 0.0 | אקולוג | 0.0 | 13/03/2016 | 21/01/2018 | 11 | None | None | 4 | 1 | None | 12/10/2021 | 1634033544 |
| 18164 | 44449 | 32295 | 1.0 | 3 | 1.0 | - | 1.0 | 23/12/2019 | None | 4 | 999 | 6 | 5 | 5 | None | 26/10/2021 | 1635271310 |
| 18165 | 44450 | 32295 | 0.0 | None | NaN | None | NaN | None | None | None | None | None | None | None | None | 26/10/2021 | 1635271310 |
| 18166 | 42522 | 31903 | 0.0 | None | NaN | None | NaN | None | None | None | None | None | None | None | 999 | 17/10/2021 | 1634506906 |
| 18167 | 42523 | 31903 | 0.0 | None | NaN | None | NaN | None | None | None | None | None | None | None | None | 17/10/2021 | 1634506906 |
18168 rows × 17 columns
# Keep only each user's most recent employment record (latest entry_date),
# then tally users by employment-status code.
df_employment = (df_employment
                 .sort_values('entry_date')
                 .drop_duplicates('shecodes_id', keep='last'))
df_employment.groupby('currently_employed')["shecodes_id"].count()
currently_employed 0.0 4757 1.0 4221 2.0 42 Name: shecodes_id, dtype: int64
# Inspect employment after deduplication (one row per user).
df_employment
| id | shecodes_id | currently_employed | position_scope | is_tech_company | company_name | is_tech_position | start_job_date | end_job_date | employment_domain | employment_role | job_rank | seniority | company_type | not_working_reason | entry_date | submit_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6177 | 9139 | 16507 | 0.0 | None | NaN | NaN | None | None | None | None | None | None | None | None | None | ||
| 4173 | 180 | 1004 | 1.0 | 4 | 1.0 | תעשייה אווירית | 1.0 | None | None | 35 | 1 | None | 4 | 7 | None | None | |
| 4176 | 5981 | 12617 | 0.0 | None | 0.0 | 0.0 | None | None | None | None | None | None | None | None | None | ||
| 4181 | 4724 | 11220 | 1.0 | 3 | 0.0 | צהל | 0.0 | None | None | None | None | None | 5 | 4 | None | None | |
| 4183 | 7099 | 13898 | 1.0 | None | NaN | NaN | None | None | None | None | None | None | None | None | None | ||
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1343 | 2075 | 8213 | 0.0 | None | 0.0 | 0.0 | None | None | None | None | None | None | None | None | 9/1/2018 | None | |
| 1909 | 3291 | 9514 | 1.0 | 3 | 0.0 | אלביט | 0.0 | None | None | 11 | None | None | 5 | 7 | None | 9/1/2018 | None |
| 1918 | 3307 | 9531 | 1.0 | 3 | 0.0 | פירון חיפה חברת עורכי דין | 0.0 | None | None | 3 | None | None | 6 | 7 | None | 9/1/2018 | None |
| 925 | 1193 | 6617 | 1.0 | 3 | 0.0 | הדרכה ישירה | 0.0 | None | None | 18 | None | None | 5 | 2 | None | 9/1/2018 | None |
| 759 | 318 | 2053 | 1.0 | 5 | NaN | NaN | None | None | None | None | None | None | None | None | None | None |
9092 rows × 17 columns
# Inspect the graduates follow-up records.
df_graduates
| id | userid_connect | track_id_connect | badge_id | graduation_type | date_enrolled | date_graduated | |
|---|---|---|---|---|---|---|---|
| 0 | 2365 | 1343 | 11 | 21 | 4 | 2018-04-08 16:07:21 | 2018-12-02 21:07:55 |
| 1 | 2366 | 8701 | 11 | 21 | 4 | 2018-05-02 15:32:11 | 2018-11-14 18:34:02 |
| 2 | 2367 | 12120 | 11 | 21 | 4 | 2018-07-23 05:49:02 | 2018-11-25 18:49:01 |
| 3 | 2368 | 11 | 11 | 27 | 3 | 2016-08-01 10:44:05 | 2018-10-22 10:23:31 |
| 4 | 2369 | 50 | 11 | 27 | 3 | 2016-07-25 12:22:38 | 1970-01-01 00:00:01 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2620 | 14061 | 26081 | 183 | 276 | 2 | 2021-08-03 07:34:07 | 2021-11-15 17:37:09 |
| 2621 | 14062 | 26298 | 183 | 276 | 2 | 2021-08-03 07:36:54 | 2021-11-15 18:39:53 |
| 2622 | 14070 | 26443 | 183 | 276 | 2 | 2021-08-03 07:36:57 | 2021-11-16 18:04:44 |
| 2623 | 14076 | 26379 | 183 | 276 | 2 | 2021-08-03 07:36:55 | 2021-11-18 14:35:23 |
| 2624 | 14079 | 26455 | 183 | 276 | 2 | 2021-08-03 07:36:57 | 2021-11-19 13:29:45 |
2625 rows × 7 columns
# Keep each user's latest enrollment record (one row per userid_connect).
df_graduates_1 = (df_graduates
                  .sort_values("date_enrolled")
                  .drop_duplicates(subset=['userid_connect'], keep='last'))
df_graduates_1
| id | userid_connect | track_id_connect | badge_id | graduation_type | date_enrolled | date_graduated | |
|---|---|---|---|---|---|---|---|
| 190 | 6577 | 50 | 11 | 27 | 3 | 2016-07-25 12:22:38 | 2018-11-15 14:52:13 |
| 189 | 6576 | 11 | 11 | 27 | 3 | 2016-08-01 10:44:05 | 2018-11-15 14:52:13 |
| 1530 | 12600 | 117 | 84 | 1 | 2 | 2016-08-02 15:53:02 | 2019-01-08 18:44:17 |
| 655 | 9137 | 139 | 11 | 71 | 2 | 2016-08-02 15:57:56 | 2020-04-01 17:34:09 |
| 680 | 10773 | 141 | 11 | 71 | 2 | 2016-08-02 17:01:35 | 2020-10-07 20:47:00 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2555 | 13721 | 26655 | 183 | 275 | 4 | 2021-08-06 08:22:41 | 2021-10-02 01:44:04 |
| 2553 | 13693 | 18665 | 183 | 276 | 2 | 2021-08-07 21:09:53 | 2021-09-28 21:40:03 |
| 2565 | 13776 | 13575 | 183 | 276 | 2 | 2021-08-09 15:01:53 | 2021-10-11 16:09:38 |
| 2619 | 14058 | 26714 | 183 | 276 | 2 | 2021-08-15 17:37:45 | 2021-11-14 18:07:12 |
| 2534 | 13910 | 22031 | 177 | 254 | 4 | 2021-10-28 06:29:02 | 2021-10-28 06:30:07 |
1631 rows × 7 columns
# Join users with their latest graduation record, keeping only the columns
# needed downstream.
df_grad_1 = df_users.merge(df_graduates_1, on="userid_connect", how="inner")
# FIX: the original performed this identical column selection twice in a row;
# the second pass was a no-op and has been removed.
df_grad_1 = df_grad_1[["shecodes_id","userid_connect", "track_id_connect", "graduation_type",
                       "date_enrolled", "date_graduated"]]
# Final merge with students from the Python course. Graduates and non-graduates.
df_educ_back = df_grad_1.merge(df_education_fin, on = "shecodes_id", how = "inner")
df_educ_back = df_educ_back[["shecodes_id","userid_connect", "track_id_connect", "graduation_type",
                             "date_enrolled", "date_graduated", "type_1.0","type_2.0", "type_3.0" ,"type_4.0" ]]
def cat_educ(x):
    """Map a row's one-hot education-type flags to a background category code.

    Expects mapping-style access to the keys "type_1.0" .. "type_4.0" where
    type_1 = high school, type_2 = tech background, type_3 = higher education,
    type_4 = other course. Returns 1-4 for the categories below, 0 otherwise.
    Branch order matters: e.g. higher-ed + tech wins over high-school + tech.
    """
    high_school = x["type_1.0"]
    tech = x["type_2.0"]
    higher_ed = x["type_3.0"]
    other = x["type_4.0"]
    # 1: High School only
    if high_school == 1 and tech == 0 and higher_ed == 0 and other == 0:
        return 1
    # 2: Higher education with tech background
    if higher_ed == 1 and tech == 1:
        return 2
    # 3: High school with tech background
    if high_school == 1 and higher_ed == 0 and tech == 1:
        return 3
    # 4: Higher Education only
    if higher_ed == 1 and tech == 0 and other == 0:
        return 4
    # 0: Other
    return 0
## Situation that we want to show in the graph
# Categorise each user's education background (apply(cat_educ) — the lambda
# wrapper around it was redundant).
df_educ_back["status"] = df_educ_back.apply(cat_educ, axis = 1)
###TRYING TO DO IT WITH GRAD
#Before, I had in 3 "80% Complete", and in 4: 100% Complete
graduation_type_dict = {2: "Incomplete", 3:"Complete", 4:"Complete"}
# FIX: assign the result back instead of Series.replace(..., inplace=True) on a
# column selection — chained inplace replacement is deprecated in pandas 2.x and
# is not guaranteed to write through to the parent DataFrame.
df_educ_back["graduation_type"] = df_educ_back["graduation_type"].replace(graduation_type_dict)
status_dict = {0: "Other", 1:"High School only", 2:"Higher education with tech background",
               3:"High school with tech background", 4:"Higher Education only "}
df_educ_back["status"] = df_educ_back["status"].replace(status_dict)
df_educ_back
| shecodes_id | userid_connect | track_id_connect | graduation_type | date_enrolled | date_graduated | type_1.0 | type_2.0 | type_3.0 | type_4.0 | status | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 7 | 11 | Incomplete | 2018-04-08 16:50:46 | 2019-07-02 18:49:20 | 1 | 1 | 1 | 1 | Higher education with tech background |
| 1 | 9 | 11 | 11 | Complete | 2016-08-01 10:44:05 | 2018-11-15 14:52:13 | 1 | 0 | 0 | 0 | High School only |
| 2 | 88 | 90 | 11 | Incomplete | 2018-07-29 11:33:31 | 2020-05-19 07:26:57 | 1 | 0 | 0 | 0 | High School only |
| 3 | 112 | 115 | 144 | Incomplete | 2021-01-27 06:52:49 | 2021-02-20 21:59:00 | 1 | 0 | 1 | 0 | Higher Education only |
| 4 | 114 | 117 | 84 | Incomplete | 2016-08-02 15:53:02 | 2019-01-08 18:44:17 | 1 | 0 | 1 | 0 | Higher Education only |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1488 | 31036 | 26531 | 183 | Incomplete | 2021-08-03 07:36:58 | 2021-11-02 19:50:04 | 1 | 0 | 1 | 0 | Higher Education only |
| 1489 | 31067 | 26113 | 183 | Incomplete | 2021-08-03 07:34:08 | 2021-09-28 13:01:08 | 1 | 0 | 1 | 0 | Higher Education only |
| 1490 | 31071 | 26455 | 183 | Incomplete | 2021-08-03 07:36:57 | 2021-11-19 13:29:45 | 1 | 0 | 1 | 0 | Higher Education only |
| 1491 | 31158 | 26475 | 183 | Complete | 2021-08-03 07:36:57 | 2021-09-26 14:34:04 | 1 | 0 | 1 | 0 | Higher Education only |
| 1492 | 31225 | 26497 | 183 | Incomplete | 2021-08-03 07:36:58 | 2021-10-10 07:48:06 | 1 | 1 | 0 | 0 | High school with tech background |
1493 rows × 11 columns
# Frequency table of education-background categories, with percentage share.
status_counts = df_educ_back["status"].value_counts()
graduation_level_counts = status_counts.rename_axis("status2").reset_index(name="counts")
total = graduation_level_counts["counts"].sum()
graduation_level_counts["percent"] = (graduation_level_counts["counts"] / total) * 100
graduation_level_counts
| status2 | counts | percent | |
|---|---|---|---|
| 0 | Higher Education only | 809 | 54.186202 |
| 1 | Higher education with tech background | 488 | 32.685867 |
| 2 | High School only | 124 | 8.305425 |
| 3 | Other | 42 | 2.813128 |
| 4 | High school with tech background | 30 | 2.009377 |
# Horizontal bar chart of education-background shares.
fig_11 = graduation_level_counts[["percent"]].plot(kind='barh', title ="SheCodes Users: Education background", figsize=(8, 10),
legend=True, fontsize=12, align='center', color='#86bf91', zorder=2, width=0.85)
sns.set_style("whitegrid", {'axes.grid' : False})
fig_11.set_xlabel("Percent (%)", fontsize=12, weight="bold", color="black")
fig_11.set_ylabel("Education background", fontsize=12, weight="bold", color="black")
fig_11.set_title("SheCodes Users: Education background", fontsize=14, color="black")
# NOTE(review): labels are hard-coded in the current value_counts order — if the
# category frequencies ever change, these will mislabel the bars; derive them
# from graduation_level_counts["status2"] instead.
fig_11.set_yticklabels(["Higher Education only", "Higher education with tech background", "High School only", "Other",
"High school with tech background"])
fig_11.get_legend().remove()
plt.savefig("fig_11.pdf")
plt.show()
# Stacked bar chart: education composition within each graduation type.
agg_educ = df_educ_back.groupby(['graduation_type', "status"])["shecodes_id"].count().unstack()
sns.set_style("whitegrid", {'axes.grid' : False})
# FIX: removed the stray plt.figure(1) call — DataFrame.plot creates its own
# figure, so the extra call only emitted an empty "<Figure ... with 0 Axes>".
agg_educ.plot(kind='bar', stacked=True)
plt.title('Composition of education by graduation level', fontweight='bold', color = 'black', fontsize='14', horizontalalignment='center')
plt.xticks(rotation=0, ha='center')
plt.xlabel('Graduation Type', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.ylabel("Number of students", fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.legend(loc='upper left')
plt.savefig("fig_6.pdf")
plt.show()
<Figure size 576x396 with 0 Axes>
# Join employment with the graduates cohort; keep only the analysis columns.
df_employ_fin = df_employment.merge(df_grad_1, how = "inner", on = "shecodes_id")
keep_cols = ["shecodes_id", "currently_employed", "is_tech_company", "is_tech_position",
             "graduation_type", "position_scope"]
df_employ_fin = df_employ_fin[keep_cols]
# Drop status code 2 (meaning not documented here; only 0/1 are analyzed).
df_employ_fin = df_employ_fin.loc[df_employ_fin["currently_employed"] != 2]
df_employ_fin
| shecodes_id | currently_employed | is_tech_company | is_tech_position | graduation_type | position_scope | |
|---|---|---|---|---|---|---|
| 0 | 129 | 0.0 | NaN | NaN | 3 | None |
| 1 | 12364 | 1.0 | 0.0 | 0.0 | 2 | 4 |
| 2 | 13585 | 0.0 | NaN | 0.0 | 2 | None |
| 3 | 13715 | 0.0 | NaN | 0.0 | 3 | None |
| 4 | 13746 | 1.0 | NaN | 1.0 | 2 | 3 |
| ... | ... | ... | ... | ... | ... | ... |
| 1534 | 9202 | 1.0 | 0.0 | 0.0 | 2 | 3 |
| 1535 | 9122 | 0.0 | 0.0 | 0.0 | 4 | None |
| 1536 | 9112 | 0.0 | 0.0 | 0.0 | 3 | None |
| 1537 | 5197 | 1.0 | 0.0 | 1.0 | 4 | 3 |
| 1538 | 9454 | 0.0 | 0.0 | 0.0 | 4 | None |
1531 rows × 6 columns
# Recode graduation and employment codes to labels, then plot a stacked bar
# chart of employment status within each graduation type.
graduation_type_dict = {2: "Incomplete", 3:"Complete", 4:"Complete"}
# FIX: assign back instead of replace(..., inplace=True) on a column selection
# (deprecated chained assignment in pandas 2.x).
df_employ_fin["graduation_type"] = df_employ_fin["graduation_type"].replace(graduation_type_dict)
employ_dict = {0: "Unemployed", 1:"Employed"}
df_employ_fin["currently_employed"] = df_employ_fin["currently_employed"].replace(employ_dict)
agg_employ = df_employ_fin.groupby(['graduation_type', "currently_employed"])["shecodes_id"].count().unstack().fillna(0)
sns.set_style("whitegrid", {'axes.grid' : False})
# FIX: removed the stray plt.figure() call — DataFrame.plot creates its own
# figure (the empty "<Figure ...>" in the output came from it).
plt.rcParams["figure.figsize"] = [6.00, 8.00]
agg_employ.plot(kind='bar', stacked=True)
plt.title('Composition of employment level by graduation type', fontweight='bold', color = 'black', fontsize='14', horizontalalignment='center')
plt.xticks(rotation=0, ha='center')
plt.xlabel('Graduation Type', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.ylabel("Number of students", fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.legend(loc='upper left')
plt.savefig("fig_7.pdf")
plt.show()
<Figure size 576x396 with 0 Axes>
# Inspect df_employ_fin after the label recoding.
df_employ_fin
| shecodes_id | currently_employed | is_tech_company | is_tech_position | graduation_type | position_scope | |
|---|---|---|---|---|---|---|
| 0 | 129 | Unemployed | NaN | NaN | Complete | None |
| 1 | 12364 | Employed | 0.0 | 0.0 | Incomplete | 4 |
| 2 | 13585 | Unemployed | NaN | 0.0 | Incomplete | None |
| 3 | 13715 | Unemployed | NaN | 0.0 | Complete | None |
| 4 | 13746 | Employed | NaN | 1.0 | Incomplete | 3 |
| ... | ... | ... | ... | ... | ... | ... |
| 1534 | 9202 | Employed | 0.0 | 0.0 | Incomplete | 3 |
| 1535 | 9122 | Unemployed | 0.0 | 0.0 | Complete | None |
| 1536 | 9112 | Unemployed | 0.0 | 0.0 | Complete | None |
| 1537 | 5197 | Employed | 0.0 | 1.0 | Complete | 3 |
| 1538 | 9454 | Unemployed | 0.0 | 0.0 | Complete | None |
1531 rows × 6 columns
# Share of employed vs unemployed users.
employ_counts = df_employ_fin["currently_employed"].value_counts()
df_employ_fin_counts = employ_counts.rename_axis("currently_employed2").reset_index(name="counts")
n_total = df_employ_fin_counts["counts"].sum()
df_employ_fin_counts["percent"] = (df_employ_fin_counts["counts"] / n_total) * 100
df_employ_fin_counts
| currently_employed2 | counts | percent | |
|---|---|---|---|
| 0 | Unemployed | 867 | 58.187919 |
| 1 | Employed | 623 | 41.812081 |
# Pie chart of employment-status shares (only the first two colors are used).
my_colors=["lightblue","lightgreen",
"silver","green"]
# NOTE(review): groupby sorts its keys alphabetically ("Employed" < "Unemployed"),
# which is why the hard-coded labels below line up with the data; derive labels
# from the index to stay robust if categories change.
fig_12 = df_employ_fin_counts.groupby(["currently_employed2"]).sum().plot(kind="pie",
y="percent",
figsize=(6,6),colors=my_colors,
autopct="%1.1f%%",
explode=(0,0.15),
labels= ["Employed", "Unemployed"],
ylabel="",
title = "SheCodes Users: Employment Status")
fig_12.figure.savefig("fig_12.pdf")
fig_12
<AxesSubplot:title={'center':'SheCodes Users: Employment Status'}>
# Recode position_scope numeric codes to readable labels.
df_employ_fin["position_scope"] = pd.to_numeric(df_employ_fin["position_scope"])
position_scope_dict = {1: "Part-time (less than 50%)",
                       2:"Part-time (over 50%)",
                       3: "Full-time (over35 hours per week)",
                       4: "Partially employed",
                       5: "Freelance / Independent"}
# FIX: assign back instead of replace(..., inplace=True) on a column selection
# (deprecated chained assignment in pandas 2.x).
df_employ_fin["position_scope"] = df_employ_fin["position_scope"].replace(position_scope_dict)
df_employ_fin
| shecodes_id | currently_employed | is_tech_company | is_tech_position | graduation_type | position_scope | |
|---|---|---|---|---|---|---|
| 0 | 129 | Unemployed | NaN | NaN | Complete | NaN |
| 1 | 12364 | Employed | 0.0 | 0.0 | Incomplete | Partially employed |
| 2 | 13585 | Unemployed | NaN | 0.0 | Incomplete | NaN |
| 3 | 13715 | Unemployed | NaN | 0.0 | Complete | NaN |
| 4 | 13746 | Employed | NaN | 1.0 | Incomplete | Full-time (over35 hours per week) |
| ... | ... | ... | ... | ... | ... | ... |
| 1534 | 9202 | Employed | 0.0 | 0.0 | Incomplete | Full-time (over35 hours per week) |
| 1535 | 9122 | Unemployed | 0.0 | 0.0 | Complete | NaN |
| 1536 | 9112 | Unemployed | 0.0 | 0.0 | Complete | NaN |
| 1537 | 5197 | Employed | 0.0 | 1.0 | Complete | Full-time (over35 hours per week) |
| 1538 | 9454 | Unemployed | 0.0 | 0.0 | Complete | NaN |
1531 rows × 6 columns
# Stacked bar chart: position scope within each graduation type.
agg_employ_2 = df_employ_fin.groupby(['graduation_type', "position_scope"])["shecodes_id"].count().unstack().fillna(0)
sns.set_style("whitegrid", {'axes.grid' : False})
# FIX: removed the stray plt.figure() call — DataFrame.plot creates its own
# figure (the empty "<Figure ...>" in the output came from it).
plt.rcParams["figure.figsize"] = [6.00, 8.00]
agg_employ_2.plot(kind='bar', stacked=True)
plt.title('Composition by position scope', fontweight='bold', color = 'black', fontsize='14', horizontalalignment='center')
plt.xticks(rotation=0, ha='center')
plt.xlabel('Graduation Type', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.ylabel("Number of students", fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.legend(loc='upper left')
plt.savefig("fig_8.pdf")
plt.show()
<Figure size 432x576 with 0 Axes>
# Share of each position-scope category among employed users.
scope_counts = df_employ_fin["position_scope"].value_counts()
df_employ_fin_counts2 = scope_counts.rename_axis("df_employ_fin2").reset_index(name="counts")
n_scoped = df_employ_fin_counts2["counts"].sum()
df_employ_fin_counts2["percent"] = (df_employ_fin_counts2["counts"] / n_scoped) * 100
df_employ_fin_counts2
| df_employ_fin2 | counts | percent | |
|---|---|---|---|
| 0 | Full-time (over35 hours per week) | 597 | 70.152761 |
| 1 | Part-time (over 50%) | 88 | 10.340776 |
| 2 | Part-time (less than 50%) | 73 | 8.578143 |
| 3 | Partially employed | 70 | 8.225617 |
| 4 | Freelance / Independent | 23 | 2.702703 |
# Horizontal bar chart of position-scope shares.
fig_13 = df_employ_fin_counts2[["percent"]].plot(kind='barh', title ="SheCodes Users: Employment Type", figsize=(8, 10),
legend=True, fontsize=12, align='center', color='#86bf91', zorder=2, width=0.85)
sns.set_style("whitegrid", {'axes.grid' : False})
fig_13.set_xlabel("Percent (%)", fontsize=12, weight="bold", color="black")
fig_13.set_ylabel("Employment Type", fontsize=12, weight="bold", color="black")
fig_13.set_title("SheCodes Users: Employment Type", fontsize=14, color="black")
# NOTE(review): labels are hard-coded in the current value_counts order — derive
# them from df_employ_fin_counts2["df_employ_fin2"] to stay robust.
fig_13.set_yticklabels(["Full-time >35 hours per week", "Part-time >50%", "Part-time <50%", "Partially employed",
"Freelance / Independent"])
fig_13.get_legend().remove()
plt.savefig("fig_13.pdf")
plt.show()
# Recode the tech-company flag to readable labels.
is_tech_dict = {0: "Non-tech Company", 1:"Tech Company"}
# FIX: assign back instead of replace(..., inplace=True) on a column selection
# (deprecated chained assignment in pandas 2.x).
df_employ_fin["is_tech_company"] = df_employ_fin["is_tech_company"].replace(is_tech_dict)
df_employ_fin
| shecodes_id | currently_employed | is_tech_company | is_tech_position | graduation_type | position_scope | |
|---|---|---|---|---|---|---|
| 0 | 129 | Unemployed | NaN | NaN | Complete | NaN |
| 1 | 12364 | Employed | Non-tech Company | 0.0 | Incomplete | Partially employed |
| 2 | 13585 | Unemployed | NaN | 0.0 | Incomplete | NaN |
| 3 | 13715 | Unemployed | NaN | 0.0 | Complete | NaN |
| 4 | 13746 | Employed | NaN | 1.0 | Incomplete | Full-time (over35 hours per week) |
| ... | ... | ... | ... | ... | ... | ... |
| 1534 | 9202 | Employed | Non-tech Company | 0.0 | Incomplete | Full-time (over35 hours per week) |
| 1535 | 9122 | Unemployed | Non-tech Company | 0.0 | Complete | NaN |
| 1536 | 9112 | Unemployed | Non-tech Company | 0.0 | Complete | NaN |
| 1537 | 5197 | Employed | Non-tech Company | 1.0 | Complete | Full-time (over35 hours per week) |
| 1538 | 9454 | Unemployed | Non-tech Company | 0.0 | Complete | NaN |
1531 rows × 6 columns
# Share of users working at tech vs non-tech companies.
tech_counts = df_employ_fin["is_tech_company"].value_counts()
df_employ_fin_counts3 = tech_counts.rename_axis("is_tech_company2").reset_index(name="counts")
n_tech = df_employ_fin_counts3["counts"].sum()
df_employ_fin_counts3["percent"] = (df_employ_fin_counts3["counts"] / n_tech) * 100
df_employ_fin_counts3
| is_tech_company2 | counts | percent | |
|---|---|---|---|
| 0 | Non-tech Company | 594 | 67.5 |
| 1 | Tech Company | 286 | 32.5 |
# Pie chart of tech vs non-tech company shares (only the first two colors used).
my_colors=["lightblue","lightgreen",
"silver","green"]
# NOTE(review): groupby sorts keys alphabetically ("Non-tech Company" < "Tech
# Company"), so the hard-coded labels match the data order; derive them from the
# index to stay robust if categories change.
fig_14 = df_employ_fin_counts3.groupby(["is_tech_company2"]).sum().plot(kind="pie",
y="percent",
figsize=(6,6),colors=my_colors,
autopct="%1.1f%%",
explode=(0,0.15),
labels= ["Non-tech Company", "Tech Company"],
ylabel="",
title = "SheCodes Users: Employment type")
fig_14.figure.savefig("fig_14.pdf")
fig_14
<AxesSubplot:title={'center':'SheCodes Users: Employment type'}>
# Stacked bar chart: tech vs non-tech company within each graduation type.
agg_employ_3 = df_employ_fin.groupby(['graduation_type', "is_tech_company"])["shecodes_id"].count().unstack().fillna(0)
sns.set_style("whitegrid", {'axes.grid' : False})
# FIX: removed the stray plt.figure() call — DataFrame.plot creates its own
# figure (the empty "<Figure ...>" in the output came from it).
plt.rcParams["figure.figsize"] = [6.00, 8.00]
agg_employ_3.plot(kind='bar', stacked=True)
plt.title('Is the student working in a tech company?', fontweight='bold', color = 'black', fontsize='14', horizontalalignment='center')
plt.xticks(rotation=0, ha='center')
plt.xlabel('Graduation Type', fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.ylabel("Number of students", fontweight='bold', color = 'black', fontsize='12', horizontalalignment='center')
plt.legend(loc='upper left')
plt.savefig("fig_9.pdf")
plt.show()
<Figure size 432x576 with 0 Axes>
# Completion rate: share of Complete vs Incomplete graduation types.
grad_counts = df_educ_back["graduation_type"].value_counts()
df_educ_back_counts = grad_counts.rename_axis("graduation_type").reset_index(name="counts")
n_grads = df_educ_back_counts["counts"].sum()
df_educ_back_counts["percent"] = (df_educ_back_counts["counts"] / n_grads)*100
df_educ_back_counts
| graduation_type | counts | percent | |
|---|---|---|---|
| 0 | Incomplete | 990 | 66.309444 |
| 1 | Complete | 503 | 33.690556 |
# Pie chart of program-completion shares (only the first two colors are used).
my_colors=["lightblue","lightgreen",
"silver","green"]
# NOTE(review): groupby sorts keys alphabetically ("Complete" < "Incomplete"),
# so the hard-coded labels match the data order; derive them from the index to
# stay robust if categories change.
fig_10 = df_educ_back_counts.groupby(["graduation_type"]).sum().plot(kind="pie",
y="percent",
figsize=(6,6),colors=my_colors,
autopct="%1.1f%%",
explode=(0,0.15),
labels= ["Complete", "Incomplete"],
ylabel="",
title = "Percentage of users who complete the program")
fig_10.figure.savefig("fig_10.pdf")
fig_10
<AxesSubplot:title={'center':'Percentage of users who complete the program'}>